# Package setup ----
# NOTE(review): plyr is attached BEFORE dplyr on purpose -- the package
# attached last wins name clashes, so this order lets bare calls to
# summarise()/mutate()/arrange() in the pipelines below resolve to dplyr.
# (The original attached plyr after dplyr, silently masking those verbs.)
library(plyr)
library(readxl)
library(dplyr)
library(cluster)
library(ClusterR)
library(ggplot2)
library(tidyverse)
library(moments)
library(forcats)
library(gridExtra)
library(RColorBrewer)
library(rsample)
library(broom.mixed)
library(Metrics)
library(corrplot)
library(caret)
library(factoextra)
library(ggpubr)
library(rpart)
library(rpart.plot)
library(jtools)
library(devtools)
# ggbiplot is GitHub-only: install it (if missing) BEFORE attaching it.
# The original called install_github() AFTER library(ggbiplot), which
# fails on any machine where the package is not yet installed, and
# re-installed it on every run.
if (!requireNamespace("ggbiplot", quietly = TRUE)) {
  install_github("vqv/ggbiplot")
}
library(ggbiplot)
# Read the raw HR dataset from an Excel workbook in the working directory.
data <- read_excel("HR_Employee_Data.xlsx")
# Structural overview of the raw data: 14,999 rows x 11 columns
# (console output preserved below).
glimpse(data,width = getOption("width"))
## Rows: 14,999
## Columns: 11
## $ Emp_Id <chr> "IND02438", "IND28133", "IND07164", "IND30478", …
## $ satisfaction_level <dbl> 0.38, 0.80, 0.11, 0.72, 0.37, 0.41, 0.10, 0.92, …
## $ last_evaluation <dbl> 0.53, 0.86, 0.88, 0.87, 0.52, 0.50, 0.77, 0.85, …
## $ number_project <dbl> 2, 5, 7, 5, 2, 2, 6, 5, 5, 2, 2, 6, 4, 2, 2, 2, …
## $ average_montly_hours <dbl> 157, 262, 272, 223, 159, 153, 247, 259, 224, 142…
## $ time_spend_company <dbl> 3, 6, 4, 5, 3, 3, 4, 5, 5, 3, 3, 4, 5, 3, 3, 3, …
## $ Work_accident <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ left <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ promotion_last_5years <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Department <chr> "sales", "sales", "sales", "sales", "sales", "sa…
## $ salary <chr> "low", "medium", "medium", "low", "low", "low", …
# Total count of missing values across the whole data frame -- zero,
# so no imputation is needed.
sum(is.na(data))
## [1] 0
# Preprocessing ----
# Min-max scale every numeric column to [0, 1] using caret's preProcess().
range_scaler <- preProcess(data, method = "range")
data <- predict(range_scaler, data)
# Encode salary as integer factor codes.  Levels are alphabetical, so the
# coding is 1 = high, 2 = low, 3 = medium -- NOT an ordinal scale.
data <- transform(data, salary = as.integer(as.factor(salary)))
# Drop the employee ID: it is a row identifier, not a feature.
data <- select(data, -Emp_Id)
# Department frequency table ----
# One row per department with its employee count, largest first.
top_salaries <- dplyr::count(data, Department, name = "count", sort = TRUE)
# Keep the five most common departments for the bar chart below.
top_5_data <- head(top_salaries, 5)
# Overview of the transformed data: scaled numerics, integer salary codes,
# Emp_Id removed (console output preserved below).
glimpse(data)
## Rows: 14,999
## Columns: 10
## $ satisfaction_level <dbl> 0.31868132, 0.78021978, 0.02197802, 0.69230769, …
## $ last_evaluation <dbl> 0.265625, 0.781250, 0.812500, 0.796875, 0.250000…
## $ number_project <dbl> 0.0, 0.6, 1.0, 0.6, 0.0, 0.0, 0.8, 0.6, 0.6, 0.0…
## $ average_montly_hours <dbl> 0.2850467, 0.7757009, 0.8224299, 0.5934579, 0.29…
## $ time_spend_company <dbl> 0.125, 0.500, 0.250, 0.375, 0.125, 0.125, 0.250,…
## $ Work_accident <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ left <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ promotion_last_5years <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Department <chr> "sales", "sales", "sales", "sales", "sales", "sa…
## $ salary <int> 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
# Pie chart of department composition ----
# Per-department counts plus a ready-made slice label ("dept : NN %")
# and the numeric share used as the slice size.
employment_type_count <- data %>%
  dplyr::count(Department, name = "count") %>%
  dplyr::mutate(
    Percent = paste(Department, ":", round(100 * count / sum(count)), "%"),
    prop    = count / sum(count) * 100
  )
# A stacked bar chart in polar coordinates is a pie chart; slices are
# labelled through the fill legend.
ggplot(employment_type_count, aes(x = "", y = prop, fill = Percent)) +
  geom_bar(stat = "identity", width = 1, color = "black") +
  coord_polar("y", start = 0) +
  theme_void()
# Bar chart of the five largest departments ----
# Bars ordered by descending count; counts printed inside the bars.
ggplot(top_5_data, aes(x = reorder(Department, -count), y = count)) +
  geom_col(aes(fill = Department)) +
  geom_text(aes(label = count), vjust = 1.5, colour = "white",
            position = position_dodge(.9), size = 5) +
  labs(title = "Number of different positions in data science",
       x = "", y = "") +
  theme(
    axis.text.x = element_text(angle = 30, size = 10,
                               color = "black", hjust = 0.5),
    legend.position = "none"
  )
# Correlation heatmap ----
# Columns 1:8 plus 10 are all the numeric features (Department, column 9,
# is character and must be excluded from cor()).
data_cor <- data[, c(1:8, 10)]
# Store the matrix as cor_matrix rather than `cor`, so the base function
# stats::cor() is not shadowed by a local object of the same name.
cor_matrix <- cor(data_cor)
corrplot(cor_matrix, method = 'color')
## 5. Apply PCA
# PCA on the standardized (centered + scaled) numeric features.
datapca <- prcomp(data_cor, scale = TRUE, center = TRUE)
# The sign of each principal component is arbitrary; flip them for easier
# interpretation.  Flip BOTH the loadings ($rotation) and the scores ($x):
# the original flipped only $rotation, leaving the scores inconsistent
# with the loadings (biplot arrows and points would disagree).
datapca$rotation <- -datapca$rotation
datapca$x <- -datapca$x
head(datapca$rotation)
## PC1 PC2 PC3 PC4
## satisfaction_level -0.18916086 0.6084103 -0.1442392 0.002025143
## last_evaluation 0.46380764 0.3118634 -0.1548850 0.017424116
## number_project 0.55705957 0.1219683 -0.0106627 0.023291069
## average_montly_hours 0.52562028 0.1779333 -0.1097887 0.034336941
## time_spend_company 0.33377147 -0.1177003 0.4409212 -0.054714112
## Work_accident -0.06439211 0.2808987 0.4273870 0.032900192
## PC5 PC6 PC7 PC8
## satisfaction_level -0.254121334 0.32261547 -0.24348487 0.29096526
## last_evaluation -0.104336225 0.06469366 -0.52265596 -0.54805156
## number_project 0.096762382 -0.18884268 0.47347561 -0.24135584
## average_montly_hours 0.009167641 -0.25359580 -0.02333746 0.72134010
## time_spend_company -0.042899061 0.79304840 0.16023409 0.09322204
## Work_accident 0.811963652 -0.06525810 -0.25275425 0.02940217
## PC9
## satisfaction_level 0.51053109
## last_evaluation -0.27336728
## number_project 0.58880888
## average_montly_hours -0.30587487
## time_spend_company -0.11048404
## Work_accident 0.07010533
summary(datapca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.3642 1.2102 1.0304 0.9995 0.9775 0.91951 0.84196
## Proportion of Variance 0.2068 0.1627 0.1180 0.1110 0.1062 0.09394 0.07877
## Cumulative Proportion 0.2068 0.3695 0.4875 0.5985 0.7046 0.79858 0.87735
## PC8 PC9
## Standard deviation 0.79150 0.69094
## Proportion of Variance 0.06961 0.05304
## Cumulative Proportion 0.94696 1.00000
# Scree plot with per-component variance labels.
fviz_eig(datapca, addlabels = TRUE, barfill = "red")
# Biplot grouped by attrition.  `left` is numeric 0/1, so convert it to a
# factor to get discrete groups and ellipses rather than a continuous scale.
ggbiplot(datapca, labels.size = 0, alpha = 0.1, ellipse = TRUE,
         groups = factor(data$left))
### 5.C. Final comments
# Logistic regression baseline ----
# Reproducible 80/20 split, stratified on the outcome by caret.
set.seed(241)
idx <- createDataPartition(data$left, p = .8, list = FALSE)
trainSet <- data[idx, ]
testSet <- data[-idx, ]
# Binomial GLM predicting attrition (`left`) from every remaining column.
modelLogistic <- glm(left ~ ., family = "binomial", data = trainSet)
summary(modelLogistic)
##
## Call:
## glm(formula = left ~ ., family = "binomial", data = trainSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4184 -0.6786 -0.4309 -0.1477 3.1315
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.572945 0.154168 3.716 0.000202 ***
## satisfaction_level -3.731186 0.098541 -37.864 < 2e-16 ***
## last_evaluation 0.480410 0.104601 4.593 4.37e-06 ***
## number_project -1.594227 0.117225 -13.600 < 2e-16 ***
## average_montly_hours 0.881358 0.121211 7.271 3.56e-13 ***
## time_spend_company 1.921482 0.134889 14.245 < 2e-16 ***
## Work_accident -1.528568 0.099376 -15.382 < 2e-16 ***
## promotion_last_5years -1.687754 0.280596 -6.015 1.80e-09 ***
## Departmenthr 0.145235 0.143898 1.009 0.312833
## DepartmentIT -0.205707 0.132241 -1.556 0.119816
## Departmentmanagement -0.807246 0.171708 -4.701 2.59e-06 ***
## Departmentmarketing -0.111705 0.142393 -0.784 0.432758
## Departmentproduct_mng -0.299672 0.143000 -2.096 0.036117 *
## DepartmentRandD -0.681839 0.160028 -4.261 2.04e-05 ***
## Departmentsales -0.052734 0.110742 -0.476 0.633942
## Departmentsupport 0.007872 0.118154 0.067 0.946883
## Departmenttechnical -0.019375 0.115688 -0.167 0.866992
## salary -0.027806 0.039570 -0.703 0.482242
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13163 on 11999 degrees of freedom
## Residual deviance: 10619 on 11982 degrees of freedom
## AIC: 10655
##
## Number of Fisher Scoring iterations: 5
# Evaluate the logistic model on the hold-out set ----
# Predicted probabilities of leaving, then a hard 0/1 call at 0.5.
prob <- predict(modelLogistic, testSet, type = "response")
pred <- as.integer(prob > 0.5)
# Explicit 0/1 factor levels so both classes always appear in the matrix.
matrixLog <- confusionMatrix(
  factor(pred, levels = c(0, 1)),
  factor(testSet$left, levels = c(0, 1))
)
matrixLog
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2111 511
## 1 170 207
##
## Accuracy : 0.7729
## 95% CI : (0.7575, 0.7878)
## No Information Rate : 0.7606
## P-Value [Acc > NIR] : 0.05851
##
## Kappa : 0.2553
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.9255
## Specificity : 0.2883
## Pos Pred Value : 0.8051
## Neg Pred Value : 0.5491
## Prevalence : 0.7606
## Detection Rate : 0.7039
## Detection Prevalence : 0.8743
## Balanced Accuracy : 0.6069
##
## 'Positive' Class : 0
##
# Coefficient plot (point estimates with confidence intervals).
plot_summs(modelLogistic)
### 6.C Final Comments
# PCA on all numeric columns (1:8 plus salary at 10; Department, column 9,
# is character).  NOTE(review): this recomputes exactly the PCA already
# stored in `datapca` above -- `datapca` could be reused instead.
left_pc <- prcomp(data[,c(1:8,10)], center = TRUE, scale = TRUE)
summary(left_pc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.3642 1.2102 1.0304 0.9995 0.9775 0.91951 0.84196
## Proportion of Variance 0.2068 0.1627 0.1180 0.1110 0.1062 0.09394 0.07877
## Cumulative Proportion 0.2068 0.3695 0.4875 0.5985 0.7046 0.79858 0.87735
## PC8 PC9
## Standard deviation 0.79150 0.69094
## Proportion of Variance 0.06961 0.05304
## Cumulative Proportion 0.94696 1.00000
# Line scree plot of the first 5 components, with the Kaiser criterion
# (eigenvalue = 1) drawn as a reference line.
screeplot(left_pc, type = "l", npcs = 5, main = "Screeplot of the first 5 PCs")
abline(h = 1, col="red", lty="longdash")
legend("topright", legend=c("Eigenvalue = 1"),
col=c("red"), lty=5, cex=0.6)
# Observations projected onto the PC1/PC2 plane; axis percentages are the
# variance proportions taken from the summary above.
plot(left_pc$x[,1],left_pc$x[,2], xlab="PC1 (20.7%)", ylab = "PC2 (16.3%)", main = "PC1 / PC2 - plot")
# k-means with k = 2 ----
set.seed(101)
# BUG FIX: the original wrote `data[, 1:8, 10]`, which passes 10 as the
# `drop` argument of `[.data.frame` and silently selects only columns
# 1:8 (salary was never included).  The intended selection -- matching
# data_cor and left_pc above -- is columns 1:8 plus 10.
km <- kmeans(data[, c(1:8, 10)], 2)
# Color the PC1/PC2 scatter by cluster assignment.
plot(left_pc$x[,1],left_pc$x[,2], xlab="PC1 (20.7%)",
ylab = "PC2 (16.3%)",
main = "PC1 / PC2 - plot",
col=km$cluster)
km$centers
# NOTE(review): the transcript below was produced by the original buggy
# column selection and therefore omits the salary column.
## satisfaction_level last_evaluation number_project average_montly_hours
## 1 0.6135455 0.5517880 0.3577686 0.4851325
## 2 0.5679486 0.5571902 0.3610912 0.4918627
## time_spend_company Work_accident left promotion_last_5years
## 1 0.1882204 1 0.07791609 0.03503919
## 2 0.1871200 0 0.26515978 0.01893998
# k-means with k = 3 ----
set.seed(102)
# BUG FIX: same `[, 1:8, 10]` indexing slip as above -- use c(1:8, 10).
km <- kmeans(data[, c(1:8, 10)], 3)
plot(left_pc$x[,1],left_pc$x[,2], xlab="PC1 (20.7%)",
ylab = "PC2 (16.3%)",
main = "PC1 / PC2 - plot",
col=km$cluster)
#### k-means clustering into k=3 clusters and then visualizing the
#### results
# (the word "results" previously sat on its own uncommented line, which
# errors at run time with "object 'results' not found")
# Cross-tabulate cluster assignments against the true attrition label.
table(km$cluster, data$left)
##
## 0 1
## 1 2000 121
## 2 0 3450
## 3 9428 0
# Reuse the seeded fit above for the cluster visualization.  The original
# re-ran kmeans() here without a seed, making the plot non-reproducible
# and potentially inconsistent with the table just printed.
fviz_cluster(km, data = data[, c(1:8, 10)])
# NOTE(review): this recomputes the same PCA as `left_pc` above a third
# time; kept as in the original transcript.
left_pc <- prcomp(data[,c(1:8,10)], center = TRUE, scale = TRUE)
summary(left_pc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.3642 1.2102 1.0304 0.9995 0.9775 0.91951 0.84196
## Proportion of Variance 0.2068 0.1627 0.1180 0.1110 0.1062 0.09394 0.07877
## Cumulative Proportion 0.2068 0.3695 0.4875 0.5985 0.7046 0.79858 0.87735
## PC8 PC9
## Standard deviation 0.79150 0.69094
## Proportion of Variance 0.06961 0.05304
## Cumulative Proportion 0.94696 1.00000
# Hierarchical clustering (Ward linkage) on the numeric features,
# excluding the target `left` (col 7) and Department (col 9).
# NOTE(review): dist() on ~15k rows builds a ~112M-entry distance matrix
# (roughly 0.9 GB) -- confirm the memory budget allows this.
distance <- dist(data[, c(1:6, 8, 10)], method = "euclidean")
cluster_data <- hclust(distance, method = "ward.D")
# Dendrogram with the 3-cluster cut highlighted.
plot(cluster_data)
rect.hclust(cluster_data, k=3)
# Flatten the tree into 3 cluster labels, one per observation.
sm_hc <- cutree(cluster_data, 3)
# 2-D cluster plot of the hierarchical solution.
clusplot(data[, c(1:6, 8, 10)], clus = sm_hc, lines = 0, shade = TRUE, color = TRUE, labels = 2, plotchar = FALSE, span = TRUE)
#plot(cluster_data)
#clusplot(data[, c(1:6, 8, 10)], clus = sm_hc, lines = 0, shade = TRUE, color = TRUE, labels = 2, plotchar = FALSE, span = TRUE)
# Decision tree ----
# `left` is numeric 0/1, so rpart fits a regression (anova) tree and
# type = "vector" yields per-leaf mean attrition rates in [0, 1].
model <- rpart(formula = left ~ ., data = trainSet)
prob2 <- predict(model , testSet ,type="vector")
# The tree produces only a handful of distinct predicted values
# (one per leaf), listed below.
levels(as.factor(prob2))
## [1] "0" "0.014986568641312" "0.046031746031746"
## [4] "0.0654205607476635" "0.0764227642276423" "0.0833333333333333"
## [7] "0.918774966711052" "0.969507427677873" "1"
levels(as.factor(testSet$left))
## [1] "0" "1"
# BUG FIX: the original thresholded `prob` (the logistic-regression
# probabilities from earlier) instead of `prob2`, so the "tree" confusion
# matrix was actually re-evaluating the logistic model at a 0.36 cutoff.
pred2 <- ifelse(prob2 > 0.36, 1, 0)
matrixLog2 <- confusionMatrix(
factor(pred2, levels = c(0,1)),
factor(testSet$left, levels = c(0,1)))
matrixLog2
# NOTE(review): the transcript below reflects the original buggy
# evaluation (logistic probabilities at 0.36), not the tree predictions.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1949 251
## 1 332 467
##
## Accuracy : 0.8056
## 95% CI : (0.791, 0.8196)
## No Information Rate : 0.7606
## P-Value [Acc > NIR] : 1.956e-09
##
## Kappa : 0.4861
##
## Mcnemar's Test P-Value : 0.0009221
##
## Sensitivity : 0.8544
## Specificity : 0.6504
## Pos Pred Value : 0.8859
## Neg Pred Value : 0.5845
## Prevalence : 0.7606
## Detection Rate : 0.6499
## Detection Prevalence : 0.7336
## Balanced Accuracy : 0.7524
##
## 'Positive' Class : 0
##
# Visualize the fitted tree with node numbers.
rpart.plot(model, box.palette="BuGn", shadow.col="gray", nn=TRUE)